Prev Exercises: Udacity:DeepLearning:TensorFlow:notMNIST
notMNIST: This notebook uses the notMNIST dataset to be used with python experiments. This dataset is designed to look like the classic MNIST dataset, while looking a little more like real data: it's a harder task, and the data is a lot less 'clean' than MNIST.
import sys
print sys.version
from joblib import Parallel, delayed
import multiprocessing
nCores = multiprocessing.cpu_count() - 2 # Allow other apps to run
print 'nCores: %d' % (nCores)
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
from datetime import datetime, time
import numpy as np
import os
import pandas as pd
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from skimage import color as sk_color
from skimage import io as sk_io
from skimage import transform as sk_transform
import tarfile
%run img_utils.py
The specs should be in img_glbSpecs_SFDD
print type('string')
%run img_glbSpec_SFDD_ImgSz_64.py
#print 'glbDataFile: %s' % (glbDataFile)
print 'glbImg: %s' % (glbImg)
print 'glbRspClass: %s' % (glbRspClass)
print 'glbRspClassN: %d' % (glbRspClassN)
print 'glbPickleFile: %s' % (glbPickleFile)
# glbDataURL = 'http://yaroslavvb.com/upload/notMNIST/'
# glbImg['size'] = 32
First, we'll download the dataset to our local machine.
print type('string')
def maybe_download(url, filename, expected_bytes = None):
    """Download url + filename into data/ if not present and verify its size.

    Args:
        url: base URL; filename is appended to form the full download URL.
        filename: file name; stored locally as 'data/' + filename.
        expected_bytes: expected file size in bytes; when None, any
            non-empty file is accepted.

    Returns:
        The local path 'data/' + filename.

    Raises:
        Exception: when the file fails the size verification.
    """
    dest = 'data/' + filename
    if not os.path.exists(dest):
        # BUG FIX: the original downloaded to the CWD (filename) but then
        # stat'ed and returned 'data/' + filename, so a fresh download was
        # never found where the rest of this cell looked for it.
        dest, _ = urlretrieve(url + filename, dest)
    statinfo = os.stat(dest)
    if expected_bytes is None:
        # No expected size known: accept any non-empty file.
        verified = statinfo.st_size > 0
    else:
        verified = statinfo.st_size == expected_bytes
    if verified:
        print('Found and verified', dest)
    else:
        raise Exception(
            'Failed to verify' + filename + '. Can you get to it with a browser?')
    return dest
dataFNm = maybe_download(glbDataFile['url'], glbDataFile['filename'])
# url = 'http://yaroslavvb.com/upload/notMNIST/'
# def maybe_download(url, filename, expected_bytes):
# """Download a file if not present, and make sure it's the right size."""
# if not os.path.exists(filename):
# filename, _ = urlretrieve(url + filename, filename)
# statinfo = os.stat(filename)
# if statinfo.st_size == expected_bytes:
# print('Found and verified', filename)
# else:
# raise Exception(
# 'Failed to verify' + filename + '. Can you get to it with a browser?')
# return filename
# train_filename = maybe_download('data/notMNIST_large.tar.gz', 247336696)
# test_filename = maybe_download('data/notMNIST_small.tar.gz', 8458043)
Extract the dataset from the compressed downloaded file(s).
def extract(filename, num_classes):
    """Extract the .tar.gz archive at *filename*; return one folder per class.

    NOTE(review): the early ``return`` below short-circuits the whole
    function — everything after it is currently dead code, presumably
    disabled until "is extraction needed?" detection is implemented.
    Confirm intent before re-enabling.
    """
    print("Figure out automatically if data needs to be extracted")
    return
    tar = tarfile.open(filename)
    # Two splitext passes strip both extensions of '.tar.gz'.
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
    # My edits: data_folders needs to be modified for the correct path
    # (skip macOS Finder metadata files).
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root)) if d != '.DS_Store']
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
if (glbDataFile['extract']):
train_folders = extract(os.getcwd() + train_filename, glbRspClassN)
test_folders = extract(os.getcwd() + test_filename , glbRspClassN)
notMNIST:
Extraction give you a set of directories, labelled A through J.
The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k labelled examples and the obsNewSet (test set) about 19,000. Given these sizes, it should be possible to train models quickly on any machine.
Let's take a peek at some of the data to make sure it looks sensible.
print type('string')
driverDf = pd.read_csv('data/driver_imgs_list.csv')
print driverDf.describe()
# print driverDf.shape
print driverDf.head()
print driverDf.tail()
trnFoldersPth = os.getcwd() + '/data/' + glbDataFile['trnFoldersPth']
newFoldersPth = os.getcwd() + '/data/' + glbDataFile['newFoldersPth']
# print(trnFoldersPth)
# print(newFoldersPth)
Collect data corrections into glbDataScrub
print type('string')
def myreadImage(filePthNm):
img = sk_io.imread(filePthNm)
try:
assert img.shape == glbImg['shape'], 'img.shape: %s' % \
(img.shape)
assert np.min(img) >= 0, 'img.min: %.4f' % \
(np.min(img))
assert np.max(img) <= glbImg['pxlDepth'], 'img.min: %.4f' % \
(np.max(img))
except AssertionError, e:
print 'filePthNm: %s' % (filePthNm)
print e
raise
return(img)
plt.imshow(myreadImage(trnFoldersPth + '/c0/img_15117.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c8/img_67168.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_84986.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_95888.jpg'))
print type('string')
smpClsImg = {}; smpN = 3
for cls in glbRspClass:
clsImg = {}
# print 'Class: %s' % (cls)
clsPth = trnFoldersPth + '/' + cls
onlyfiles = [f for f in os.listdir(clsPth)
if os.path.isfile(os.path.join(clsPth, f))]
for ix in np.random.randint(0, len(onlyfiles), size = smpN):
# print ' %s:' % (onlyfiles[ix])
# img = sk_io.imread(clsPth + '/' + onlyfiles[ix])
# assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
# assert np.min(img) == 0, 'img.min: %.4f' % (np.min(img))
# assert np.max(img) == glbImg['pxlDepth'], 'img.min: %.4f' % (np.max(img))
clsImg[onlyfiles[ix]] = myreadImage(clsPth + '/' + onlyfiles[ix])
# jpgfile = Image(clsPth + '/' + onlyfiles[ix], format = 'jpg',
# width = glbImg['size'] * 4, height = glbImg['size'] * 4)
# display(jpgfile)
smpClsImg[cls] = clsImg
# print smpClsImg
figs, axes = plt.subplots(len(glbRspClass), smpN,
figsize=(5 * smpN, 4 * len(glbRspClass)))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, cls in enumerate(smpClsImg.keys()):
for j, imgFileName in enumerate(smpClsImg[cls].keys()):
axes[i, j].imshow(smpClsImg[cls][imgFileName])
axes[i, j].set_title(cls + ':' + imgFileName)
print type('string')
smpSbtImg = {}; smpN = 3
for sbt in driverDf['subject'].values[
np.random.randint(0, len(driverDf['subject'].values),
size = smpN)]:
sbtImg = {}
# print ' subject: %s' % (sbt)
driverSbtDf = driverDf[driverDf['subject'] == sbt]
# print driverSbtDf.shape
clsPth = trnFoldersPth + '/' + cls
onlyfiles = [f for f in os.listdir(clsPth)
if os.path.isfile(os.path.join(clsPth, f))]
for cls in driverSbtDf['classname'].values[
np.random.randint(0, len(driverSbtDf['classname'].values),
size = smpN)]:
# print ' class: %s' % (cls)
# print " driverSbtDf[driverSbtDf['classname'] == cls]['img'].shape = %s" % \
# (driverSbtDf[driverSbtDf['classname'] == cls]['img'].shape)
imgFnm = driverSbtDf[driverSbtDf['classname'] == cls]['img'].iloc[0]
dctKey = cls + ':' + imgFnm
imgFnm = trnFoldersPth + '/' + cls + '/' + imgFnm
# img = sk_io.imread(imgFnm)
# assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
sbtImg[dctKey] = myreadImage(imgFnm)
# jpgfile = Image(clsPth + '/' + onlyfiles[ix], format = 'jpg',
# width = glbImg['size'] * 4, height = glbImg['size'] * 4)
# display(jpgfile)
smpSbtImg[sbt] = sbtImg
# print smpClsImg
nRow = smpN; nCol = smpN
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 6 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbt in enumerate(smpSbtImg.keys()):
for j, imgDesc in enumerate(smpSbtImg[sbt].keys()):
axes[i, j].imshow(smpSbtImg[sbt][imgDesc])
axes[i, j].set_title(sbt + ':' + imgDesc)
print type('string')
def mytransformImage(raw, retVals = 'final'):
    """Run the preprocessing pipeline configured in glbImg on one image.

    Stages, each applied only when configured: crop to glbImg['crop'],
    resize to a glbImg['size'] square, grayscale when not glbImg['color'],
    and center/scale into roughly [-0.5, 0.5] when glbImg['center_scale'].

    Args:
        raw: raw image array (as returned by myreadImage).
        retVals: 'final' returns only the fully processed image; 'each'
            returns a dict of per-stage intermediates ('raw', 'crp',
            'sze', 'gry', 'c_s') plus the cumulative result under 'fnl'.
    """
    assert retVals in ['final', 'each'], \
        'unsupported retVals option: %s' % (retVals)
    # 'fnl' accumulates the result of each enabled stage, in order.
    prcImgDct = {'raw': raw, 'fnl': raw.astype(float)}
    fnlShape = rawShape = raw.shape
    # 'crop': crop to the configured x/y window, then resize back to the
    # raw shape so subsequent stages always see the same input size.
    if ('crop' in glbImg.keys()):
        xmin = 0; xmax = rawShape[1]
        ymin = 0; ymax = rawShape[0]
        if ('x' in glbImg['crop'].keys()):
            xmin, xmax = glbImg['crop']['x']
        if ('y' in glbImg['crop'].keys()):
            ymin, ymax = glbImg['crop']['y']
        if retVals == 'each':
            prcImgDct['crp'] = sk_transform.resize(raw[ymin : ymax,
                                                       xmin : xmax],
                                                   rawShape)
        prcImgDct['fnl'] = sk_transform.resize(
            prcImgDct['fnl'][ymin : ymax, xmin : xmax],
            rawShape)
    # 'size': resize to a glbImg['size'] square.
    # NOTE(review): the channel count is always taken from rawShape[2];
    # the disabled variant suggested 1 channel when not glbImg['color'] —
    # confirm which is intended.
    fnlShape = (glbImg['size'], glbImg['size'], rawShape[2])
    if (rawShape != fnlShape):
        if retVals == 'each':
            prcImgDct['sze'] = sk_transform.resize(raw, fnlShape)
        prcImgDct['fnl'] = sk_transform.resize(prcImgDct['fnl'], fnlShape)
    # 'color': collapse to grayscale when color is disabled.
    if not glbImg['color']:
        if retVals == 'each':
            prcImgDct['gry'] = sk_color.rgb2gray(raw)
        prcImgDct['fnl'] = sk_color.rgb2gray(prcImgDct['fnl'])
    # 'center_scale': shift and scale pixel values into ~[-0.5, 0.5].
    if glbImg['center_scale']:
        if retVals == 'each':
            prcImgDct['c_s'] = (raw.astype(float) - glbImg['pxlDepth'] / 2.0) / \
                glbImg['pxlDepth']
        prcImgDct['fnl'] = (prcImgDct['fnl'] - glbImg['pxlDepth'] / 2.0) / \
            glbImg['pxlDepth']
    if retVals == 'final':
        return prcImgDct['fnl']
    else:
        return prcImgDct
sbt = smpSbtImg.keys()[0]
tstRawImg = smpSbtImg[sbt][smpSbtImg[sbt].keys()[0]]
tstPrcImg = mytransformImage(tstRawImg, retVals = 'final')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
if (j == 0):
axes[j].imshow(tstRawImg)
axes[j].set_title('raw')
if (j == 1):
if not glbImg['color']:
plt.imshow(tstPrcImg, cmap = 'gray')
else:
plt.imshow(tstPrcImg)
axes[j].set_title('fnl')
plt.show()
tstPrcImg = mytransformImage(tstRawImg, retVals = 'each')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
if (j == 0):
axes[j].imshow(tstRawImg)
axes[j].set_title('raw')
if (j == 1):
if not glbImg['color']:
plt.imshow(tstPrcImg['fnl'], cmap = 'gray')
else:
plt.imshow(tstPrcImg['fnl'])
axes[j].set_title('fnl')
nRow = 1; nCol = len(tstPrcImg.values()) - 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(list(set(tstPrcImg.keys()) - set(['raw', 'fnl']))):
if (typImg == 'gry'):
axes[j].imshow(tstPrcImg[typImg], cmap = 'gray')
else:
axes[j].imshow(tstPrcImg[typImg])
axes[j].set_title(typImg)
print type('string')
smpSbt0Img = smpSbtImg[smpSbtImg.keys()[0]]
smpPrcImg = {}
for key, value in smpSbt0Img.items():
smpPrcImg[smpSbtImg.keys()[0] + ':' + key] = value
print 'smpPrcImg.keys(): %s' % (smpPrcImg.keys())
for key, raw in smpPrcImg.items():
prcImgDct = mytransformImage(raw, retVals = 'each')
smpPrcImg[key] = prcImgDct
# Ideally 'fnl' should be the last col in the plot
nRow = len(smpPrcImg.keys()); nCol = len(smpPrcImg.values()[0].keys())
# print 'nRow: %d; nCol: %d' % (nRow, nCol)
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbtClsImgFnm in enumerate(smpPrcImg.keys()):
for j, typImg in enumerate(smpPrcImg[sbtClsImgFnm].keys()):
if ((typImg == 'gry') or
((typImg == 'fnl') and ('gry' in smpPrcImg[sbtClsImgFnm].keys()))):
if (nRow > 1):
axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
else:
axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
else:
if (nRow > 1):
axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
else:
axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
if (nRow > 1):
axes[i, j].set_title(sbtClsImgFnm + ':' + typImg)
else:
axes[j].set_title(sbtClsImgFnm + ':' + typImg)
print type('string')
onlyfiles = [f for f in os.listdir(newFoldersPth)
if os.path.isfile(os.path.join(newFoldersPth, f))]
# print onlyfiles[:5]
smpNewImg = {}; smpN = 3
# print smpN ** 2
# print np.random.randint(0, len(onlyfiles), size = smpN ** 2)
for imgFnm in [onlyfiles[ix]
for ix in np.random.randint(0, len(onlyfiles), size = smpN ** 2)]:
# print ' imgFnm: %s' % (imgFnm)
# img = sk_io.imread(newFoldersPth + '/' + imgFnm)
# assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
smpNewImg[imgFnm] = myreadImage(newFoldersPth + '/' + imgFnm)
nRow = smpN; nCol = smpN
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 5 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, imgFnm in enumerate(smpNewImg.keys()):
axes[i / nCol, i % nCol].imshow(smpNewImg[imgFnm])
axes[i / nCol, i % nCol].set_title(imgFnm)
Each exemplar should be an image of a character A through J rendered in a different font.
# Display sample train images
# train_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/notMNIST_large/'
# glbImg['size'] = 28
# display(Image(train_folders_path + 'A/a2F6b28udHRm.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'B/bnVuaS50dGY=.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'C/cmlzay50dGY=.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
Now let's load the data in a more manageable format.
We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean (notMNIST only: and standard deviation ~0.5) to make training easier down the road. The labels will be stored in a separate array (notMNIST only: of integers 0 through 9).
A few images might not be readable, we'll just skip them.
trnFolders = os.getcwd() + '/data/' + glbDataFile['trnFoldersPth']
trnFolders = [trnFolders + '/' + cls for cls in glbRspClass]
print 'trnFolders: %s' % (trnFolders)
newFolders = [os.getcwd() + '/data/' + glbDataFile['newFoldersPth']]
print 'newFolders: %s' % (newFolders)
# data_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/'
# train_folders = [data_folders_path + 'notMNIST_large/' + d \
# for d in sorted(os.listdir(data_folders_path + 'notMNIST_large/')) \
# if d != '.DS_Store']
# print train_folders
# test_folders = [data_folders_path + 'notMNIST_small/' + d \
# for d in sorted(os.listdir(data_folders_path + 'notMNIST_small/')) \
# if d != '.DS_Store']
# print test_folders
#from scipy import misc as sp_misc
def load(idClass, folderPth, nImgMax, maxCheck = True, verbose = False):
assert isinstance(idClass, str), \
'expecting type(idClass) as str, not %s' % (type(idClass))
assert isinstance(folderPth, str), \
'expecting type(folderPth) as str, not %s' % (type(folderPth))
assert nImgMax > 0, \
'nImgMax: %d has to be > 0' % (nImgMax)
assert isinstance(maxCheck, bool), \
'expecting type(maxCheck) as bool, not %s' % (type(maxCheck))
startTm = datetime.now()
ids = ['' for ix in xrange(nImgMax)]
dataset = np.ndarray(
shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
# label_index = 0
try:
labelsVal = glbRspClass.index(idClass)
except ValueError, e:
print 'unknown class: %s; defaulting label to -1' % (idClass)
labelsVal = -1
except Exception, e:
print(e)
raise
labels[:] = labelsVal
image_index = 0
# if isinstance(data_folders, str):
# data_folders = [data_folders]
# for fldrIx, folder in enumerate(data_folders):
print 'Class: %s; Folder: %s' % (idClass, folderPth)
# print(os.listdir(folder)[:6])
for image in os.listdir(folderPth):
# print(image)
# print((image_index >= (nImgMax / len(data_folders) * (fldrIx + 1))))
if maxCheck and (image_index >= nImgMax):
raise Exception('More images than expected: %d >= %d' % (
image_index, nImgMax))
# elif (image_index >= (nImgMax / len(data_folders) * (fldrIx + 1))):
elif image_index >= nImgMax: break
image_file = os.path.join(folderPth, image)
try:
rawImg = myreadImage(image_file)
except IOError as e:
print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
next
prcImg = mytransformImage(rawImg, retVals = 'final')
# try:
# rsz_image_data = sp_misc.imresize(ndimage.imread(image_file, flatten = not glbImgColor),
# (glbImg['size'], glbImg['size']))
# image_data = (rsz_image_data.astype(float) -
# glbImgPixelDepth / 2) / glbImgPixelDepth
# if image_data.shape != (glbImg['size'], glbImg['size']):
# raise Exception('Unexpected image shape: %s' % str(image_data.shape))
ids[image_index] = image
dataset[image_index, :, :] = prcImg
# labels[image_index] = label_index
if mydspVerboseTrigger(image_index):
# print ' image_index: %d; %s:' % (image_index, image)
print ' image_index: %5d (%5d secs)' % \
(image_index, (datetime.now() - startTm).seconds)
if verbose:
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
for ax in axes.flatten()]
for j, typImg in enumerate(range(0, nCol)):
if (j == 0):
axes[j].imshow(rawImg)
# axes[j].set_title(glbRspClass[label_index] + ':' + image + ':raw')
axes[j].set_title(idClass + ':' + image + ':raw')
else:
if not glbImg['color']:
axes[j].imshow(prcImg, cmap = 'gray')
else:
axes[j].imshow(prcImg)
axes[j].set_title('fnl')
# display(sp_misc.toimage(rsz_image_data))
plt.show()
image_index += 1
# label_index += 1
num_images = image_index
ids = ids[0:num_images]
dataset = dataset[0:num_images, :, :]
labels = labels[0:num_images]
# if num_images < min_num_images:
# raise Exception('Many fewer images than expected: %d < %d' % (
# num_images, min_num_images))
print(' Identifiers:', len(ids))
print(' Full dataset tensor:', dataset.shape)
print(' Mean:', np.mean(dataset))
print(' Standard deviation:', np.std(dataset))
print(' Labels:', labels.shape)
print(' Label Knts:'); print(pd.Series(labels).value_counts())
return {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}}
smpC5ObsTrnDct = load('c5', trnFolders[5], 25, maxCheck = False, verbose = True)
smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
# smqObsTrnIdn, smqObsTrnFtr, smqObsTrnRsp = load(trnFolders, 250,
# max_check = False)
# print smpObsTrnRsp.value_counts()
# print smpObsTrnIdn[10:15]
# glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = load(trnFolders, 22435)
thsBgnTm = datetime.now()
smqObsTrnLst = []
# for cls in glbRspClass[-2:]:
for cls in glbRspClass:
smqClsObsTrnDct = load(cls, trnFolders[glbRspClass.index(cls)], 25,
maxCheck = False, verbose = False)
smqObsTrnLst.append(smqClsObsTrnDct)
print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Smp Sequential load duration: %0.2f seconds' % (thsDurDff)
thsBgnTm = datetime.now()
smrObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
load)(cls, trnFolders[glbRspClass.index(cls)], 25,
maxCheck = False, verbose = False) for cls in glbRspClass)
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Smp Parallel load duration: %0.2f seconds' % (thsDurDff)
def myisEqualDct(d1, d2):
    """Deep-compare two dicts, printing added / removed / modified keys.

    Dict-valued entries are compared recursively; array-like comparison
    results (e.g. numpy element-wise equality) are reduced with .all().

    Returns:
        True when both dicts have exactly the same keys and equal values.
    """
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())
    intersect_keys = d1_keys.intersection(d2_keys)
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys
    # NOTE(review): `modified` maps each differing key to the (falsy)
    # comparison result, not to the value pair the disabled one-liner
    # suggests was intended — confirm which is wanted for the printout.
    modified = {}
    for o in intersect_keys:
        if not (isinstance(d1[o], dict)):
            try:
                eql = d1[o] == d2[o]
            except ValueError, e:
                # e.g. "truth value of an array is ambiguous" — report the
                # offending key and its type, then re-raise.
                print e
                print 'key: %s: type:' % (o)
                print type(d1[o]).mro()
                raise
        else: eql = myisEqualDct(d1[o], d2[o])
        if not isinstance(eql, bool):
            # Element-wise (array) comparison: require every element equal.
            eql = eql.all()
        if not eql: modified[o] = eql
    same = set(o for o in intersect_keys if not o in modified.keys())
    if (len(added) > 0):
        print ' added: %s' % (added)
    if (len(removed) > 0):
        print ' removed: %s' % (removed)
    if (len(modified) > 0):
        print ' modified: %s' % (modified)
    if (len(same) != len(d1_keys)):
        print ' same: %s' % (same)
    # Equal only when no keys were added, removed, or modified, and every
    # shared key compared equal.
    return ((len(added) == 0) and
            (len(removed) == 0) and
            (len(modified) == 0) and
            (len(same) == len(d2_keys)))
tstAB1Dct = {'a': 1, 'b': 1}; tstAB2Dct = {'a': 1, 'b': 2}
print myisEqualDct(tstAB1Dct, tstAB1Dct)
print myisEqualDct(tstAB1Dct, tstAB2Dct)
tstABC1Dct = {'ab': tstAB1Dct, 'c' : 1};
tstABC2Dct = {'ab': tstAB2Dct, 'c' : 3};
print myisEqualDct(tstABC1Dct, tstABC1Dct)
print myisEqualDct(tstABC1Dct, tstABC2Dct)
print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst))
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst))
for clsIx in range(len(glbRspClass)):
# print 'clsIx: %s' % (clsIx)
# print "type(smqObsTrnLst[clsIx]['Dbs']):"
# print (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))
# print "type(smqObsTrnLst[clsIx]['Dbs']): %s" \
# (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))
# print smqObsTrnLst[clsIx]
assert myisEqualDct(smqObsTrnLst[clsIx], smrObsTrnLst[clsIx]), \
'diff in class: %s' % glbRspClass[clsIx]
print type('string')
# print 'numpy.ndarray' in type(smqObsTrnLst[9]['Dbs']['Rsp']).mro()
# print type(smqObsTrnLst[9]['Dbs']['Rsp'])
# print smqObsTrnLst[9]['Dbs']['Rsp'].shape
# print smqObsTrnLst[9]['Dbs']['Rsp']
# print type(smrObsTrnLst[9]['Dbs']['Rsp'])
# print smrObsTrnLst[9]['Dbs']['Rsp'].shape
# print smrObsTrnLst[9]['Dbs']['Rsp']
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp'])
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp']).value_counts()
tstArr = smrObsTrnLst[9]['Dbs']['Rsp']
print pd.Series(tstArr)
def mybuildDatabase(lclObsLst):
    """Concatenate per-class load() results into one flat database.

    Args:
        lclObsLst: list of dicts shaped
            {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}}

    Returns:
        (ids, features, labels): ids is a flat list of identifiers;
        features and labels are the per-class arrays stacked along axis 0.
    """
    lclObsIdn = []
    lclObsFtr = None
    lclObsRsp = None
    for clsDct in lclObsLst:
        dbs = clsDct['Dbs']
        lclObsIdn.extend(dbs['Idn'])
        # BUG FIX: the original tested `not (lclObsFtr == None)`; once
        # lclObsFtr is an ndarray, `== None` broadcasts element-wise and
        # `not` on the resulting array raises ValueError. Use identity.
        lclObsFtr = dbs['Ftr'] if lclObsFtr is None \
            else np.vstack((lclObsFtr, dbs['Ftr']))
        lclObsRsp = dbs['Rsp'] if lclObsRsp is None \
            else np.hstack((lclObsRsp, dbs['Rsp']))
    return lclObsIdn, lclObsFtr, lclObsRsp
smrObsTrnIdn, smrObsTrnFtr, smrObsTrnRsp = mybuildDatabase(smrObsTrnLst)
print('Identifiers:', len(smrObsTrnIdn))
print('Sample dataset tensor:', smrObsTrnFtr.shape)
print('Mean:', np.mean(smrObsTrnFtr))
print('Standard deviation:', np.std(smrObsTrnFtr))
print('Labels:', smrObsTrnRsp.shape)
# print(smrObsTrnRsp[25:30])
print('Label Knts:'); print(pd.Series(smrObsTrnRsp).value_counts())
thsBgnTm = datetime.now()
glbObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
load)(cls, trnFolders[glbRspClass.index(cls)], 2500,
maxCheck = True, verbose = False) for cls in glbRspClass)
print 'len(glbObsTrnLst): %d' % (len(glbObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Parallel load duration: %0.2f seconds' % (thsDurDff)
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = mybuildDatabase(glbObsTrnLst)
print('Identifiers:', len(glbObsTrnIdn))
print('Full dataset tensor:', glbObsTrnFtr.shape)
print('Mean:', np.mean(glbObsTrnFtr))
print('Standard deviation:', np.std(glbObsTrnFtr))
print('Labels:', glbObsTrnRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsTrnRsp).value_counts())
Move display of train images here
Move test images to different folders to parallelize. Change newObsTrnLst to glbObsNewLst
thsBgnTm = datetime.now()
newObsTrnLst = [load('new', newFolders[0], 80000,
maxCheck = True, verbose = True)]
# smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
print 'len(newObsTrnLst): %d' % (len(newObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'newObs load duration: %0.2f seconds' % (thsDurDff)
glbObsNewLst = newObsTrnLst
glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = mybuildDatabase(glbObsNewLst)
print('Identifiers:', len(glbObsNewIdn))
print('New Full dataset tensor:', glbObsNewFtr.shape)
print('Mean:', np.mean(glbObsNewFtr))
print('Standard deviation:', np.std(glbObsNewFtr))
print('Labels:', glbObsNewRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsNewRsp).value_counts())
print glbObsTrnIdn[100:105]
# NOTE(review): this call does not match load()'s current signature
# (idClass, folderPth, nImgMax, ...) — newFolders is a list and the class
# id argument is missing, so as written it would raise. Presumably this
# cell is superseded by the glbObsNewLst path above — confirm and remove.
glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = load(newFolders, 79726)
print glbObsNewIdn[1000:1005]
# Keep a copy of the labels, then blank them out so the "new" (test) set
# carries no ground truth (-1 == unknown class).
savObsNewRsp = glbObsNewRsp
glbObsNewRsp[:] = -1
print glbObsNewRsp[1000:1005]
# def load(data_folders, min_num_images, nImgMax):
# dataset = np.ndarray(
# shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
# labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
# label_index = 0
# image_index = 0
# for folder in data_folders:
# print(folder)
# for image in os.listdir(folder):
# if image_index >= nImgMax:
# raise Exception('More images than expected: %d >= %d' % (
# image_index, nImgMax))
# image_file = os.path.join(folder, image)
# try:
# image_data = (ndimage.imread(image_file).astype(float) -
# glbImgPixelDepth / 2) / glbImgPixelDepth
# if image_data.shape != (glbImg['size'], glbImg['size']):
# raise Exception('Unexpected image shape: %s' % str(image_data.shape))
# dataset[image_index, :, :] = image_data
# labels[image_index] = label_index
# image_index += 1
# except IOError as e:
# print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
# label_index += 1
# num_images = image_index
# dataset = dataset[0:num_images, :, :]
# labels = labels[0:num_images]
# if num_images < min_num_images:
# raise Exception('Many fewer images than expected: %d < %d' % (
# num_images, min_num_images))
# print('Full dataset tensor:', dataset.shape)
# print('Mean:', np.mean(dataset))
# print('Standard deviation:', np.std(dataset))
# print('Labels:', labels.shape)
# return dataset, labels
# glbObsTrnFtr, glbObsTrnRsp = load(train_folders, 450000, 550000)
# glbObsNewFtr, glbObsNewRsp = load(test_folders, 18000, 20000)
We expect the data to be balanced across classes. Verify that.
print 'glbObsTrnRsp class knts: '
print (np.unique(glbObsTrnRsp, return_counts = True))
print 'glbObsNewRsp class knts: '
print (np.unique(glbObsNewRsp, return_counts = True))
#print type(glbObsTrnRsp); print glbObsTrnRsp.shape; print glbObsTrnRsp[0:10]
# print np.sum(glbObsTrnRsp == 0)
# print np.unique(glbObsTrnRsp)
# print 'train labels freqs: %s' % \
# ([np.sum(glbObsTrnRsp == thsLabel) for thsLabel in np.unique(glbObsTrnRsp)])
Refer to glbDataScrub
Save imported data.
glbPickleFile
try:
f = open(glbPickleFile['data'], 'wb')
save = {
'glbObsTrnIdn': glbObsTrnIdn,
'glbObsTrnFtr': glbObsTrnFtr,
'glbObsTrnRsp': glbObsTrnRsp,
# 'glbObsVldFtr': glbObsVldFtr,
# 'glbObsVldRsp': glbObsVldRsp,
'glbObsNewIdn': glbObsNewIdn,
'glbObsNewFtr': glbObsNewFtr,
'glbObsNewRsp': glbObsNewRsp,
}
pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
f.close()
except Exception as e:
print('Unable to save data to', glbPickleFile['data'], ':', e)
raise
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)
# Reload the pickled data to verify it round-trips.
# NOTE(review): the save cell above opens glbPickleFile['data'] while this
# one opens 'data/' + glbPickleFile (treating it as a plain string) — the
# two cannot both be right; confirm glbPickleFile's actual structure.
with open('data/' + glbPickleFile, 'rb') as f:
    save = pickle.load(f)
    glbObsNewIdn = save['glbObsNewIdn']
    glbObsNewFtr = save['glbObsNewFtr']
    glbObsNewRsp = save['glbObsNewRsp']
    del save  # hint to help gc free up memory
    print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape, glbObsNewRsp.shape)
Let's verify that the data still looks good. Displaying a sample of the labels and images from the ndarray.
def mydisplayImages(obsIdn, obsFtr, obsRsp):
    """Show 10 random grayscale observations with their ids and class labels.

    Labels of -1 are printed as 'None' (unlabeled / "new" observations).
    NOTE(review): np.random.random_integers is deprecated in numpy in
    favor of np.random.randint (exclusive upper bound).
    """
    imgIxLst = np.random.random_integers(0, obsFtr.shape[0] - 1, 10)
    for imgIx in imgIxLst:
        if (obsRsp[imgIx] > -1):
            print ' imgIx: %d; id: %s; label: %s' % \
                (imgIx, obsIdn[imgIx], glbRspClass[obsRsp[imgIx]])
        else:
            print ' imgIx: %d; id: %s; label: None' % (imgIx, obsIdn[imgIx])
        # plt.figure without parentheses is a no-op attribute access;
        # imshow creates the figure implicitly anyway.
        plt.figure
        plt.imshow(obsFtr[imgIx,:,:], cmap = plt.cm.gray)
        plt.show()
print 'Trn set:'; mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
# dspLabels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# print 'train set:'
# imgIxLst = np.random.random_integers(0, glbObsTrnFtr.shape[0] - 1, 10)
# for imgIx in imgIxLst:
# print 'imgIx: %d: label: %s' % (imgIx, dspLabels[glbObsTrnRsp[imgIx]])
# plt.figure
# plt.imshow(glbObsTrnFtr[imgIx,:,:], cmap = plt.cm.gray)
# plt.show()
print 'New set:'; mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match.
# print type(glbObsTrnIdn)
# smpObsTrnIdn = glbObsTrnIdn[0:4]
# print smpObsTrnIdn
# print [smpObsTrnIdn[ix] for ix in [3, 1, 2, 0]]
# smpObsTrnIdn = [smpObsTrnIdn[ix] for ix in [3, 1, 2, 0]]
# print smpObsTrnIdn
np.random.seed(glbObsShuffleSeed)
def randomize(ids, dataset, labels):
    """Shuffle ids, dataset and labels with one shared random permutation.

    All three collections are reordered by the same permutation, so row k
    of every output still describes the same observation.
    """
    order = np.random.permutation(labels.shape[0])
    shuffled = ([ids[ix] for ix in order],
                dataset[order, :, :],
                labels[order])
    return shuffled
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = randomize(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
#glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = randomize(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
# np.random.seed(133)
# def randomize(dataset, labels):
# permutation = np.random.permutation(labels.shape[0])
# shuffled_dataset = dataset[permutation,:,:]
# shuffled_labels = labels[permutation]
# return shuffled_dataset, shuffled_labels
# glbObsTrnFtr, glbObsTrnRsp = randomize(glbObsTrnFtr, glbObsTrnRsp)
# glbObsNewFtr, glbObsNewRsp = randomize(glbObsNewFtr, glbObsNewRsp)
Check if data is still good after shuffling!
print 'shuffled Trn set:'; mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
#print 'shuffled New set:'; mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
Prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune obsTrnN as needed.
Also create a validation dataset for hyperparameter tuning.
# Carve a validation split (first 20% of the already-shuffled training set)
# off the training data; the remainder becomes the 'Fit' split used for
# model fitting / hyperparameter tuning.
obsTrnN = glbObsTrnFtr.shape[0] # or fixed number e.g. 20000
obsVldN = int(obsTrnN * 0.2)
print 'obsTrnN: %d; obsVldN: %d' % (obsTrnN, obsVldN)
glbObsVldIdn = glbObsTrnIdn[:obsVldN]
glbObsVldFtr = glbObsTrnFtr[:obsVldN,:,:]
glbObsVldRsp = glbObsTrnRsp[:obsVldN]
# obsVldN+obsTrnN exceeds the array length, so these slices clamp to the end
# (equivalent to [obsVldN:]); the form is kept so obsTrnN can be capped above.
glbObsFitIdn = glbObsTrnIdn[obsVldN:obsVldN+obsTrnN]
glbObsFitFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
glbObsFitRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
# NOTE: under Python 2 these print(...) calls print a tuple repr; harmless
# diagnostics kept as-is.
print(' Fitting:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print('Validation:', len(glbObsVldIdn), glbObsVldFtr.shape, glbObsVldRsp.shape)
# obsTrnN = glbObsTrnFtr.shape[0]
# #obsTrnN = 200000
# obsVldN = 10000
# glbObsVldFtr = glbObsTrnFtr[:obsVldN,:,:]
# glbObsVldRsp = glbObsTrnRsp[:obsVldN]
# glbObsTrnFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsTrnRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
# print('Training', glbObsTrnFtr.shape, glbObsTrnRsp.shape)
# print('Validation', glbObsVldFtr.shape, glbObsVldRsp.shape)
print 'glbObsVldRsp class knts & Trn ratios: '
print (np.unique(glbObsVldRsp, return_counts = True))
print (np.unique(glbObsVldRsp, return_counts = True)[1] * 1.0 /
np.unique(glbObsTrnRsp, return_counts = True)[1])
Finally, let's save the data for later reuse:
Remember to save the previous pickled file with an '_unshuffled' suffix.
# glbPickleFile = os.getcwd() + '/data/notMNIST.pickle'
# print glbPickleFile
# Persist every split (ids, features, labels) into a single pickle for the
# downstream fitting notebooks. The file is opened with a context manager so
# the handle is closed even if pickling raises (the original left it open on
# the error path).
try:
    with open('data/' + glbPickleFile, 'wb') as f:
        save = {
            'glbObsTrnIdn': glbObsTrnIdn,
            'glbObsTrnFtr': glbObsTrnFtr,
            'glbObsTrnRsp': glbObsTrnRsp,
            'glbObsFitIdn': glbObsFitIdn,
            'glbObsFitFtr': glbObsFitFtr,
            'glbObsFitRsp': glbObsFitRsp,
            'glbObsVldIdn': glbObsVldIdn,
            'glbObsVldFtr': glbObsVldFtr,
            'glbObsVldRsp': glbObsVldRsp,
            'glbObsNewIdn': glbObsNewIdn,
            'glbObsNewFtr': glbObsNewFtr,
            'glbObsNewRsp': glbObsNewRsp,
        }
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', glbPickleFile, ':', e)
    raise
# Report the on-disk size of the pickle just written.
statinfo = os.stat('data/' + glbPickleFile)
print('Compressed pickle size:', statinfo.st_size)
# #glbPickleFile = 'notMNIST.pickle'
# try:
# f = open(glbPickleFile, 'wb')
# save = {
# 'glbObsTrnFtr': glbObsTrnFtr,
# 'glbObsTrnRsp': glbObsTrnRsp,
# 'glbObsVldFtr': glbObsVldFtr,
# 'glbObsVldRsp': glbObsVldRsp,
# 'glbObsNewFtr': glbObsNewFtr,
# 'glbObsNewRsp': glbObsNewRsp,
# }
# pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
# f.close()
# except Exception as e:
# print('Unable to save data to', glbPickleFile, ':', e)
# raise
By construction, this dataset might contain a lot of overlapping samples, including training data that's also contained in the validation and test set! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never an overlap, but it is actually ok if you expect to see training samples recur when you use it. Measure how much overlap there is between training, validation and test samples.
Optional questions:
# print glbObsTrnFtr[0:3]
# print np.ascontiguousarray(glbObsTrnFtr[0:3])
# print np.ascontiguousarray(glbObsTrnFtr[0:3]).shape
obsFitSet = set(img.tostring() for img in glbObsFitFtr)
print 'Fit: shape: %s vs. len(set): %d pctDups: %0.4f' % \
(glbObsFitFtr.shape, len(obsFitSet), \
(glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
obsVldSet = set(img.tostring() for img in glbObsVldFtr)
print 'Vld: shape: %s vs. len(set): %d pctDups: %0.4f' % \
(glbObsVldFtr.shape, len(obsVldSet), \
(glbObsVldFtr.shape[0] * 1.0 / len(obsVldSet) - 1) * 100)
obsNewSet = set(img.tostring() for img in glbObsNewFtr)
print 'New: shape: %s vs. len(set): %d pctDups: %0.4f' % \
(glbObsNewFtr.shape, len(obsNewSet), \
(glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)
#print glbObsTrnFtr[0:3]
# obsFitSet = set(img.tostring() for img in glbObsTrnFtr)
# print 'train: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsTrnFtr.shape, len(obsFitSet), \
# (glbObsTrnFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
# validSet = set(img.tostring() for img in glbObsVldFtr)
# print 'valid: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsVldFtr.shape, len(validSet), \
# (glbObsVldFtr.shape[0] * 1.0 / len(validSet) - 1) * 100)
# obsNewSet = set(img.tostring() for img in glbObsNewFtr)
# print 'test : shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsNewFtr.shape, len(obsNewSet), \
# (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)
print 'Vld set overlap with Fit set: %0.4f' % \
(len(obsVldSet.intersection(obsFitSet)) * 1.0 / len(obsVldSet))
print 'Vld set overlap with New set: %0.4f' % \
(len(obsVldSet.intersection(obsNewSet)) * 1.0 / len(obsNewSet))
print 'Fit set overlap with New set: %0.4f' % \
(len(obsFitSet.intersection(obsNewSet)) * 1.0 / len(obsFitSet))
# print ' test set overlap with train set: %0.4f' % \
# (len( obsNewSet.intersection(obsFitSet)) * 1.0 / len( obsNewSet))
# print 'valid set overlap with test set: %0.4f' % \
# (len(validSet.intersection( obsNewSet)) * 1.0 / len(validSet))
Following code is in img_02_fit_lgtRgr_SFDD
Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.
Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.
Optional question: train an off-the-shelf model on all the data!
# import graphlab
# print graphlab.version
# graphlab.canvas.set_target('ipynb')
# graphlab.logistic_classifier.create(image_train,target='label',
# features=['image_array'])
print glbObsTrnFtr[0:3,:,:]
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2]))
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])).shape
from sklearn import metrics, linear_model
import pandas as pd
def fitMdl(nFitObs = 50):
mdl = linear_model.LogisticRegression(verbose = 1)
mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
(nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])), \
glbObsTrnRsp[0:nFitObs])
print mdl.get_params()
print mdl.coef_.shape
print ' coeff stats:'
for lblIx in xrange(len(dspLabels)):
print ' label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
(dspLabels[lblIx], \
mdl.coef_[lblIx,:].argmin() / glbImg['size'], \
mdl.coef_[lblIx,:].argmin() % glbImg['size'], \
mdl.coef_[lblIx,:].min(), \
mdl.coef_[lblIx,:].argmax() / glbImg['size'], \
mdl.coef_[lblIx,:].argmax() % glbImg['size'], \
mdl.coef_[lblIx,:].max())
train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
(nFitObs , glbImg['size'] ** 2)))
accuracy_train = metrics.accuracy_score(train_pred_labels, glbObsTrnRsp[0:nFitObs])
print ' accuracy train:%0.4f' % (accuracy_train)
print metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels)
valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr, \
(glbObsVldFtr.shape[0], glbImg['size'] ** 2)))
accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
print ' accuracy valid:%0.4f' % (accuracy_valid)
print metrics.confusion_matrix(glbObsVldRsp , valid_pred_labels)
test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr, \
(glbObsNewFtr.shape[0], glbImg['size'] ** 2)))
accuracy_test = metrics.accuracy_score( test_pred_labels, glbObsNewRsp)
print ' accuracy test:%0.4f' % (accuracy_test)
test_conf = pd.DataFrame(metrics.confusion_matrix( glbObsNewRsp, test_pred_labels), \
index = dspLabels, columns = dspLabels)
print test_conf
return(mdl, (accuracy_train, accuracy_valid, accuracy_test))
# Baseline fit on 50 samples; fitMdl returns (model, (fit, vld, new) accuracies).
mdl50 = fitMdl(nFitObs = 50)
# Sweep training-set sizes (note: floats, e.g. 1e2) and record accuracies.
models = pd.DataFrame({'nFitObs': [1e2, 1e3, 1e4, 1e5, glbObsTrnFtr.shape[0]]})
models = models.set_index(models['nFitObs'])
# Placeholder estimator / accuracy columns; every row is overwritten below.
models['mdl'] = linear_model.LogisticRegression()
models['accuracy.fit'] = -1; models['accuracy.vld'] = -1; models['accuracy.new'] = -1
# NOTE(review): DataFrame.ix is deprecated (removed in pandas 1.0); .loc is
# the modern equivalent -- left as-is for the old pandas this notebook targets.
for thsN in models['nFitObs']:
    models.ix[thsN, 'mdl'], (models.ix[thsN, 'accuracy.fit'], \
                             models.ix[thsN, 'accuracy.vld'], \
                             models.ix[thsN, 'accuracy.new'], \
                             ) = fitMdl(nFitObs = thsN)
print models
# Learning curves: accuracy on the fit/vld/new splits vs. number of fitting
# observations, on a log-scaled x axis.
plt.figure()
accCurves = [('accuracy.fit', 'bo-', 'fit'),
             ('accuracy.vld', 'rs-', 'vld'),
             ('accuracy.new', 'gp-', 'new')]
for colNm, fmtStr, lblStr in accCurves:
    plt.plot(models['nFitObs'], models[colNm], fmtStr, label = lblStr)
plt.legend()
plt.title("Accuracy")
plt.xscale('log')
axes = plt.gca()
axes.set_xlabel('nFitObs')
plt.show()
print dspLabels
import pandas as pd